In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

print(__version__) # requires version >= 1.9.0
2.7.0
In [3]:
import cufflinks as cf
In [4]:
# For Notebooks
init_notebook_mode(connected=True)
In [5]:
# For offline use
cf.go_offline()
In [6]:
df = pd.read_csv('WorldCups.csv')
df1 = pd.read_csv('WorldCupMatches.csv')
df2 = pd.read_csv('WorldCupPlayers.csv')

OVERALL WORLD CUP DATAFRAME

In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 10 columns):
Year              20 non-null int64
Country           20 non-null object
Winner            20 non-null object
Runners-Up        20 non-null object
Third             20 non-null object
Fourth            20 non-null object
GoalsScored       20 non-null int64
QualifiedTeams    20 non-null int64
MatchesPlayed     20 non-null int64
Attendance        20 non-null object
dtypes: int64(4), object(6)
memory usage: 1.6+ KB
In [8]:
#Exploring data
df.head(20)
Out[8]:
Year Country Winner Runners-Up Third Fourth GoalsScored QualifiedTeams MatchesPlayed Attendance
0 1930 Uruguay Uruguay Argentina USA Yugoslavia 70 13 18 590.549
1 1934 Italy Italy Czechoslovakia Germany Austria 70 16 17 363.000
2 1938 France Italy Hungary Brazil Sweden 84 15 18 375.700
3 1950 Brazil Uruguay Brazil Sweden Spain 88 13 22 1.045.246
4 1954 Switzerland Germany FR Hungary Austria Uruguay 140 16 26 768.607
5 1958 Sweden Brazil Sweden France Germany FR 126 16 35 819.810
6 1962 Chile Brazil Czechoslovakia Chile Yugoslavia 89 16 32 893.172
7 1966 England England Germany FR Portugal Soviet Union 89 16 32 1.563.135
8 1970 Mexico Brazil Italy Germany FR Uruguay 95 16 32 1.603.975
9 1974 Germany Germany FR Netherlands Poland Brazil 97 16 38 1.865.753
10 1978 Argentina Argentina Netherlands Brazil Italy 102 16 38 1.545.791
11 1982 Spain Italy Germany FR Poland France 146 24 52 2.109.723
12 1986 Mexico Argentina Germany FR France Belgium 132 24 52 2.394.031
13 1990 Italy Germany FR Argentina Italy England 115 24 52 2.516.215
14 1994 USA Brazil Italy Sweden Bulgaria 141 24 52 3.587.538
15 1998 France France Brazil Croatia Netherlands 171 32 64 2.785.100
16 2002 Korea/Japan Brazil Germany Turkey Korea Republic 161 32 64 2.705.197
17 2006 Germany Italy France Germany Portugal 147 32 64 3.359.439
18 2010 South Africa Spain Netherlands Germany Uruguay 145 32 64 3.178.856
19 2014 Brazil Germany Argentina Netherlands Brazil 171 32 64 3.386.810
In [9]:
df['Attendance'][0:3].astype(float).mean() #Only 0 to 3 work because they can be converted to floats (they only have 1 decimal)
Out[9]:
443.083
In [10]:
#We have run into problem of there being periods in the data instead of commas which prevents you from doing anything with
#those numbers. The fix would be a loop through that columns which takes the current value and reads the number, any time
#a period is found, replace it with a comma
In [11]:
#Before writing a lambda expression to change the data, lets see if I can graph it and maybe have better insight
# sns.distplot(df['Attendance'])

#This verified there is a problem with how the original data is formatted. Need to clean this column.
In [12]:
def delete_periods(string):
    string = string.replace(".", "")
    return int(string)
    
df['Attendance'] = df['Attendance'].apply(lambda x: delete_periods(x))
In [13]:
#Testing if all numbers are now valid. Since function works and average makes sense, it is now valid and ready for analysis
df['Attendance'].mean()
Out[13]:
1872882.35
In [14]:
#Graphing World Cup Attendance
plt.style.use('seaborn-darkgrid')
graph = df.plot.line(x='Year',y='Attendance',figsize=(16,8),lw=3)
graph.set_title("Number of World Cup Attendees");
graph.set_xlabel("Year")
Out[14]:
Text(0.5,0,'Year')
In [15]:
#Insight: Attendance has increased over the years with the most attended world cup being in 1994, no other has been
#as attended since then
In [16]:
#Now I start looking to see which country has won the most
In [17]:
plt.style.use('fivethirtyeight')
graph = df['Winner'].hist(bins=20,figsize=(16,6))
graph.set_title("Number of World Cup Wins");
graph.set_ylabel("Games Won")
graph
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e2a1c6aa58>
In [18]:
plt.style.use('fivethirtyeight')
graph = df['Runners-Up'].hist(bins=30,figsize=(16,6))
graph.set_title("Number of times World Cup Runners-Up");
graph.set_ylabel("Number of times")
graph
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e2a1e03ba8>
In [19]:
plt.style.use('seaborn-muted')
graph = df['Third'].hist(bins=35,figsize=(20,6))
graph.set_title("Number of times World Cup Third Place");
graph.set_ylabel("Number of times")
graph
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e2a1ebe518>
In [20]:
plt.style.use('seaborn-pastel')
graph = df['Fourth'].hist(bins=35,figsize=(25,6))
graph.set_title("Number of times World Cup Fourth Place");
graph.set_ylabel("Number of times")
graph
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e2a2189438>
In [21]:
#Aggregating top 3 placements
df_first = df['Winner']
df_second = df['Runners-Up']
df_third = df['Third']

frames = [df_first,df_second,df_third]
result = pd.concat(frames)

plt.style.use('seaborn-pastel')
graph = result.hist(bins=40,figsize=(35,10))
graph.set_title("Number of World Cup Top 3 Placements");
graph.set_ylabel("Number of times")
#plt.savefig('img/top3.png', bbox_inches='tight')
Out[21]:
Text(0,0.5,'Number of times')
In [22]:
#Graphing Number of Goals Scored
plt.style.use('seaborn-poster')
graph = df.plot.line(x='Year',y='GoalsScored',figsize=(16,8),lw=3)
graph.set_title("Number of Goals Scored");
graph.set_xlabel("Year")
Out[22]:
Text(0.5,0,'Year')
In [23]:
#Graphing Changes in World Cup tournament

plt.style.use('seaborn-deep')
plt.figure(figsize=(16,8))
plt.plot( 'Year', 'QualifiedTeams', data=df, linewidth=2,label="Number of Teams")
plt.plot( 'Year', 'MatchesPlayed', data=df, linewidth=2,label="Number of Matches Played")
plt.title("Changes in World Cup");
plt.legend()



#df.plot(x='Year',y='QualifiedTeams')
#df.plot(x='Year',y='MatchesPlayed')
#plt.show()
Out[23]:
<matplotlib.legend.Legend at 0x1e2a2602898>
In [24]:
#Insight: On the surface it seems that the number of goals increases as time went on, most likely because we are becoming
#more powerful super humans. Unfortunately that isn't the case and it is explained by the changes to the tournament
#increase of number of teams and matches played

MATCHES DATA FRAME

In [25]:
df1.info() #Matches
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 20 columns):
Year                    852 non-null float64
Datetime                852 non-null object
Stage                   852 non-null object
Stadium                 852 non-null object
City                    852 non-null object
Home Team Name          852 non-null object
Home Team Goals         852 non-null float64
Away Team Goals         852 non-null float64
Away Team Name          852 non-null object
Win conditions          852 non-null object
Attendance              850 non-null float64
Half-time Home Goals    852 non-null float64
Half-time Away Goals    852 non-null float64
Referee                 852 non-null object
Assistant 1             852 non-null object
Assistant 2             852 non-null object
RoundID                 852 non-null float64
MatchID                 852 non-null float64
Home Team Initials      852 non-null object
Away Team Initials      852 non-null object
dtypes: float64(8), object(12)
memory usage: 714.5+ KB
In [26]:
#Question to answer: Whether more goals were scored after half-time?
In [27]:
df1.head()
Out[27]:
Year Datetime Stage Stadium City Home Team Name Home Team Goals Away Team Goals Away Team Name Win conditions Attendance Half-time Home Goals Half-time Away Goals Referee Assistant 1 Assistant 2 RoundID MatchID Home Team Initials Away Team Initials
0 1930.0 13 Jul 1930 - 15:00 Group 1 Pocitos Montevideo France 4.0 1.0 Mexico 4444.0 3.0 0.0 LOMBARDI Domingo (URU) CRISTOPHE Henry (BEL) REGO Gilberto (BRA) 201.0 1096.0 FRA MEX
1 1930.0 13 Jul 1930 - 15:00 Group 4 Parque Central Montevideo USA 3.0 0.0 Belgium 18346.0 2.0 0.0 MACIAS Jose (ARG) MATEUCCI Francisco (URU) WARNKEN Alberto (CHI) 201.0 1090.0 USA BEL
2 1930.0 14 Jul 1930 - 12:45 Group 2 Parque Central Montevideo Yugoslavia 2.0 1.0 Brazil 24059.0 2.0 0.0 TEJADA Anibal (URU) VALLARINO Ricardo (URU) BALWAY Thomas (FRA) 201.0 1093.0 YUG BRA
3 1930.0 14 Jul 1930 - 14:50 Group 3 Pocitos Montevideo Romania 3.0 1.0 Peru 2549.0 1.0 0.0 WARNKEN Alberto (CHI) LANGENUS Jean (BEL) MATEUCCI Francisco (URU) 201.0 1098.0 ROU PER
4 1930.0 15 Jul 1930 - 16:00 Group 1 Parque Central Montevideo Argentina 1.0 0.0 France 23409.0 0.0 0.0 REGO Gilberto (BRA) SAUCEDO Ulises (BOL) RADULESCU Constantin (ROU) 201.0 1085.0 ARG FRA
In [28]:
ht_goals = df1['Home Team Goals'].sum() #home team goals total
In [29]:
at_goals = df1['Away Team Goals'].sum() #away team goals total
In [30]:
htht_goals = df1['Half-time Home Goals'].sum() #before half time home team goals
In [31]:
htat_goals = df1['Half-time Away Goals'].sum() #before half time away team goals
In [32]:
ahtht_goals = ht_goals - htht_goals
ahtht_goals #after half time home team goals
Out[32]:
939.0
In [33]:
ahtat_goals = at_goals - htat_goals
ahtat_goals #after half time away team goals
Out[33]:
506.0
In [34]:
#Creating a dataframe of the individual metrics

goalframe = pd.DataFrame(data={'Before Half-Time': [htht_goals, htat_goals], 'After Half-Time': [ahtht_goals, ahtat_goals]}, columns=['Before Half-Time','After Half-Time'], index=['Home','Away'])
goalframe
Out[34]:
Before Half-Time After Half-Time
Home 604.0 939.0
Away 365.0 506.0
In [35]:
goalframe.iplot(kind='bar', barmode='stack', filename='cufflinks/grouped-bar-chart', title="Total World Cup Goals", yTitle="Goals Scored")

PLAYERS DATAFRAME

In [36]:
df2.info() #Players
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37784 entries, 0 to 37783
Data columns (total 9 columns):
RoundID          37784 non-null int64
MatchID          37784 non-null int64
Team Initials    37784 non-null object
Coach Name       37784 non-null object
Line-up          37784 non-null object
Shirt Number     37784 non-null int64
Player Name      37784 non-null object
Position         4143 non-null object
Event            9069 non-null object
dtypes: int64(3), object(6)
memory usage: 2.6+ MB
In [37]:
df2.head()
Out[37]:
RoundID MatchID Team Initials Coach Name Line-up Shirt Number Player Name Position Event
0 201 1096 FRA CAUDRON Raoul (FRA) S 0 Alex THEPOT GK NaN
1 201 1096 MEX LUQUE Juan (MEX) S 0 Oscar BONFIGLIO GK NaN
2 201 1096 FRA CAUDRON Raoul (FRA) S 0 Marcel LANGILLER NaN G40'
3 201 1096 MEX LUQUE Juan (MEX) S 0 Juan CARRENO NaN G70'
4 201 1096 FRA CAUDRON Raoul (FRA) S 0 Ernest LIBERATI NaN NaN